rm(list=ls())
library(plyr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(ROCR)
library(ggmap)
library(rpart)
library(curl)
library(jsonlite)
library(broom)
library(rgdal)
library(scales)
library(rpart.plot)
library(RColorBrewer)
library(randomForest)
library(ggRandomForests)

Graph Injuries

#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)

#Filter out intersections w/o injuries
d.in <- d.in %>%
  filter(MN != 0)


#All Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=Injuries_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Monthly Injuries (2016)") 

plot(g)

#Pedestrian Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=PedInjurie_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue")  + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Pedestrian Injuries (2016)") 

plot(g)

#Bike Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=BikeInjuri_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Bike Injuries (2016)") 

plot(g) 

#Motor Vehicle Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=MVOInjurie_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Motor Vehicle Injuries (2016)") 

plot(g)

Plot injuries to map

d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)

# Set a base map
nyc <- get_map(location = c(lon = -73.935, lat = 40.712), zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.712,-73.935&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
#Bike injuries by month
bike_inj <- d.in %>% filter(Injuries == 1 & BikeInjuri == 1)

g <- ggmap(nyc) + geom_point(data = bike_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21)  + ggtitle("Bike Injuries (2016)") 

plot(g)
## Warning: Removed 51 rows containing missing values (geom_point).

#bike_zones <- readOGR("http://www.nyc.gov/html/dot/downloads/misc/bike_priority_districts.json")
#g <- ggmap(nyc) + ggmap(bike_zones)
#multiplot(g, )

#Pedestrian injuries by month
ped_inj <- d.in %>% filter(Injuries == 1 & PedInjurie == 1)

g <- ggmap(nyc) + geom_point(data = ped_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Pedestrian Injuries (2016)") 

plot(g)
## Warning: Removed 274 rows containing missing values (geom_point).

#MVO injuries by month
mvo_inj <- d.in %>% filter(Injuries == 1 & MVOInjurie == 1)

g <- ggmap(nyc) + geom_point(data = mvo_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Motor Vehicle Injuries (2016)") 

plot(g)
## Warning: Removed 908 rows containing missing values (geom_point).

Format variables

#format dataset - turn variables into factors
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)

#transform variables into factors
d.in <- d.in %>%
  mutate(Injuries = as.factor(Injuries),
         PedInjurie = as.factor(PedInjurie),
         BikeInjuri = as.factor(BikeInjuri),
         MVOInjurie = as.factor(MVOInjurie),
         bike_priority_districts = as.factor(bike_priority_districts),
         enhanced_crossings = as.factor(enhanced_crossings),
         left_turn_traffic_calming = as.factor(left_turn_traffic_calming),
         neighborhood_slow_zones = as.factor(neighborhood_slow_zones), 
         leading_pedestrian_interval_signals = as.factor(leading_pedestrian_interval_signals), 
         signal_timing = as.factor(signal_timing), 
         speed_humps = as.factor(speed_humps), 
         safe_streets_for_seniors = as.factor(safe_streets_for_seniors),
         street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
         vz_priority_corridors = as.factor(vz_priority_corridors),
         vz_priority_intersections = as.factor(vz_priority_intersections),
         arterial_slow_zones = as.factor(arterial_slow_zones), 
         MN = as.factor(MN),
         street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
         vz_priority_zones = as.factor(vz_priority_zones))

Multiple Logistic Regressions

#GLM - AllInjures (minus VZ & Bike Zone)
# VZ and Bike Zones are not street attributes - just known problem areas

injury_monthly.glm <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

summary(injury_monthly.glm)
## 
## Call:
## glm(formula = Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + 
##     arterial_slow_zones + leading_pedestrian_interval_signals + 
##     signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + 
##     neighborhood_slow_zones, family = "binomial", data = d.in)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6154  -0.4972  -0.4972  -0.4972   2.1272  
## 
## Coefficients:
##                                        Estimate Std. Error  z value
## (Intercept)                            -2.02827    0.00928 -218.568
## enhanced_crossings1                     0.41391    0.20585    2.011
## speed_humps1                            1.01748    0.02673   38.070
## left_turn_traffic_calming1              3.83208    0.22995   16.665
## arterial_slow_zones1                    0.23084    0.03411    6.766
## leading_pedestrian_interval_signals1    2.59419    0.04496   57.694
## signal_timing1                          1.57612    0.02827   55.755
## safe_streets_for_seniors1               0.55982    0.02347   23.849
## street_improvement_projects_corridors1  1.45912    0.02212   65.955
## neighborhood_slow_zones1               -0.12423    0.04753   -2.614
##                                        Pr(>|z|)    
## (Intercept)                             < 2e-16 ***
## enhanced_crossings1                     0.04435 *  
## speed_humps1                            < 2e-16 ***
## left_turn_traffic_calming1              < 2e-16 ***
## arterial_slow_zones1                   1.32e-11 ***
## leading_pedestrian_interval_signals1    < 2e-16 ***
## signal_timing1                          < 2e-16 ***
## safe_streets_for_seniors1               < 2e-16 ***
## street_improvement_projects_corridors1  < 2e-16 ***
## neighborhood_slow_zones1                0.00896 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 139436  on 140900  degrees of freedom
## Residual deviance: 114384  on 140891  degrees of freedom
## AIC: 114404
## 
## Number of Fisher Scoring iterations: 6
exp(coef((injury_monthly.glm)))
##                            (Intercept) 
##                              0.1315631 
##                    enhanced_crossings1 
##                              1.5127275 
##                           speed_humps1 
##                              2.7662042 
##             left_turn_traffic_calming1 
##                             46.1585062 
##                   arterial_slow_zones1 
##                              1.2596567 
##   leading_pedestrian_interval_signals1 
##                             13.3857050 
##                         signal_timing1 
##                              4.8361757 
##              safe_streets_for_seniors1 
##                              1.7503509 
## street_improvement_projects_corridors1 
##                              4.3021682 
##               neighborhood_slow_zones1 
##                              0.8831721
exp(confint((injury_monthly.glm)))
## Waiting for profiling to be done...
##                                             2.5 %     97.5 %
## (Intercept)                             0.1291867  0.1339726
## enhanced_crossings1                     1.0047420  2.2559794
## speed_humps1                            2.6247880  2.9147078
## left_turn_traffic_calming1             30.1608649 74.6309713
## arterial_slow_zones1                    1.1779719  1.3465245
## leading_pedestrian_interval_signals1   12.2640092 14.6281904
## signal_timing1                          4.5756508  5.1118613
## safe_streets_for_seniors1               1.6714930  1.8325959
## street_improvement_projects_corridors1  4.1195485  4.4927555
## neighborhood_slow_zones1                0.8041116  0.9688283
#build table to plot Odds Ratios
a <- data.frame(exp(coef((injury_monthly.glm))))

a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)

b <- data.frame(exp(confint((injury_monthly.glm))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)

coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"

g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
  ggtitle("Injuries - Coefficients & Confidence Intervals")

plot(g)

#GLM - Pedestrian Injuries
ped_injury <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

summary(ped_injury)
## 
## Call:
## glm(formula = PedInjurie ~ arterial_slow_zones + enhanced_crossings + 
##     speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + 
##     street_improvement_projects_corridors + neighborhood_slow_zones, 
##     family = "binomial", data = d.in)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2715  -0.2758  -0.2758  -0.2758   2.5645  
## 
## Coefficients:
##                                        Estimate Std. Error  z value
## (Intercept)                            -3.25040    0.01489 -218.262
## arterial_slow_zones1                    0.60064    0.04099   14.653
## enhanced_crossings1                     0.12559    0.31186    0.403
## speed_humps1                            0.75234    0.03791   19.844
## left_turn_traffic_calming1              1.57088    0.09052   17.354
## leading_pedestrian_interval_signals1    1.63842    0.03630   45.141
## street_improvement_projects_corridors1  1.18903    0.02867   41.467
## neighborhood_slow_zones1                0.07970    0.06747    1.181
##                                        Pr(>|z|)    
## (Intercept)                              <2e-16 ***
## arterial_slow_zones1                     <2e-16 ***
## enhanced_crossings1                       0.687    
## speed_humps1                             <2e-16 ***
## left_turn_traffic_calming1               <2e-16 ***
## leading_pedestrian_interval_signals1     <2e-16 ***
## street_improvement_projects_corridors1   <2e-16 ***
## neighborhood_slow_zones1                  0.237    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62859  on 140900  degrees of freedom
## Residual deviance: 56468  on 140893  degrees of freedom
## AIC: 56484
## 
## Number of Fisher Scoring iterations: 6
exp(coef(ped_injury))
##                            (Intercept) 
##                             0.03875886 
##                   arterial_slow_zones1 
##                             1.82328425 
##                    enhanced_crossings1 
##                             1.13381760 
##                           speed_humps1 
##                             2.12196902 
##             left_turn_traffic_calming1 
##                             4.81087968 
##   leading_pedestrian_interval_signals1 
##                             5.14702811 
## street_improvement_projects_corridors1 
##                             3.28389579 
##               neighborhood_slow_zones1 
##                             1.08296302
exp(confint(ped_injury))
## Waiting for profiling to be done...
##                                             2.5 %   97.5 %
## (Intercept)                            0.03763932 0.039902
## arterial_slow_zones1                   1.68170429 1.974872
## enhanced_crossings1                    0.58471778 2.006762
## speed_humps1                           1.96915098 2.284690
## left_turn_traffic_calming1             4.02698873 5.742805
## leading_pedestrian_interval_signals1   4.79263317 5.525446
## street_improvement_projects_corridors1 3.10392500 3.473182
## neighborhood_slow_zones1               0.94709539 1.233903
#GLM - Pedestrian Injuries
#build table to plot Odds Ratios
a <- data.frame(exp(coef((ped_injury))))

a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)

b <- data.frame(exp(confint((ped_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)

coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"

g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
  ggtitle("Pedestrian Injuries - Coefficients & Confidence Intervals")

plot(g)

#GLM - Bike Injuries
bike_injury <- glm(BikeInjuri ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

summary(bike_injury)
## 
## Call:
## glm(formula = BikeInjuri ~ bike_priority_districts + enhanced_crossings + 
##     speed_humps + left_turn_traffic_calming + arterial_slow_zones + 
##     signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, 
##     family = "binomial", data = d.in)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4369  -0.2289  -0.1702  -0.1702   2.9127  
## 
## Coefficients:
##                                        Estimate Std. Error  z value
## (Intercept)                            -4.22758    0.02403 -175.962
## bike_priority_districts1                0.59892    0.04195   14.279
## enhanced_crossings1                     0.78188    0.33793    2.314
## speed_humps1                            0.65842    0.05399   12.196
## left_turn_traffic_calming1              1.11028    0.11259    9.861
## arterial_slow_zones1                    0.26176    0.06044    4.331
## signal_timing1                          0.99781    0.04840   20.614
## street_improvement_projects_corridors1  1.19253    0.04184   28.499
## neighborhood_slow_zones1                0.44511    0.08568    5.195
##                                        Pr(>|z|)    
## (Intercept)                             < 2e-16 ***
## bike_priority_districts1                < 2e-16 ***
## enhanced_crossings1                      0.0207 *  
## speed_humps1                            < 2e-16 ***
## left_turn_traffic_calming1              < 2e-16 ***
## arterial_slow_zones1                   1.49e-05 ***
## signal_timing1                          < 2e-16 ***
## street_improvement_projects_corridors1  < 2e-16 ***
## neighborhood_slow_zones1               2.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 33168  on 140900  degrees of freedom
## Residual deviance: 30500  on 140892  degrees of freedom
## AIC: 30518
## 
## Number of Fisher Scoring iterations: 7
exp(coef(bike_injury))
##                            (Intercept) 
##                             0.01458759 
##               bike_priority_districts1 
##                             1.82015475 
##                    enhanced_crossings1 
##                             2.18557277 
##                           speed_humps1 
##                             1.93173839 
##             left_turn_traffic_calming1 
##                             3.03520294 
##                   arterial_slow_zones1 
##                             1.29921385 
##                         signal_timing1 
##                             2.71234493 
## street_improvement_projects_corridors1 
##                             3.29540128 
##               neighborhood_slow_zones1 
##                             1.56066845
exp(confint(bike_injury))
## Waiting for profiling to be done...
##                                             2.5 %     97.5 %
## (Intercept)                            0.01391226 0.01528623
## bike_priority_districts1               1.67560740 1.97510179
## enhanced_crossings1                    1.05791075 4.03440590
## speed_humps1                           1.73593167 2.14515786
## left_turn_traffic_calming1             2.42455602 3.77097376
## arterial_slow_zones1                   1.15280721 1.46106672
## signal_timing1                         2.46583508 2.98107918
## street_improvement_projects_corridors1 3.03497519 3.57598794
## neighborhood_slow_zones1               1.31524904 1.84050982
#build table to plot Odds Ratios
a <- data.frame(exp(coef((bike_injury))))

a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)

b <- data.frame(exp(confint((bike_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)

coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"

g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
  ggtitle("Bike Injuries - Coefficients & Confidence Intervals")

plot(g)

#GLM - MVO Injuries
mvo_injury <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

summary(mvo_injury)
## 
## Call:
## glm(formula = MVOInjurie ~ arterial_slow_zones + enhanced_crossings + 
##     speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + 
##     neighborhood_slow_zones, family = "binomial", data = d.in)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9923  -0.4408  -0.4408  -0.4408   2.3450  
## 
## Coefficients:
##                                         Estimate Std. Error  z value
## (Intercept)                            -2.282396   0.009952 -229.350
## arterial_slow_zones1                    0.692499   0.031505   21.981
## enhanced_crossings1                    -0.208199   0.243587   -0.855
## speed_humps1                            0.795230   0.028765   27.645
## left_turn_traffic_calming1              1.194577   0.085184   14.024
## street_improvement_projects_corridors1  1.436879   0.021219   67.718
## neighborhood_slow_zones1               -0.400998   0.056930   -7.044
##                                        Pr(>|z|)    
## (Intercept)                             < 2e-16 ***
## arterial_slow_zones1                    < 2e-16 ***
## enhanced_crossings1                       0.393    
## speed_humps1                            < 2e-16 ***
## left_turn_traffic_calming1              < 2e-16 ***
## street_improvement_projects_corridors1  < 2e-16 ***
## neighborhood_slow_zones1               1.87e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 105164  on 140900  degrees of freedom
## Residual deviance:  98877  on 140894  degrees of freedom
## AIC: 98891
## 
## Number of Fisher Scoring iterations: 5
exp(coef(mvo_injury))
##                            (Intercept) 
##                              0.1020394 
##                   arterial_slow_zones1 
##                              1.9987036 
##                    enhanced_crossings1 
##                              0.8120452 
##                           speed_humps1 
##                              2.2149506 
##             left_turn_traffic_calming1 
##                              3.3021601 
## street_improvement_projects_corridors1 
##                              4.2075444 
##               neighborhood_slow_zones1 
##                              0.6696513
exp(confint(mvo_injury))
## Waiting for profiling to be done...
##                                            2.5 %    97.5 %
## (Intercept)                            0.1000634 0.1040440
## arterial_slow_zones1                   1.8785837 2.1255291
## enhanced_crossings1                    0.4931631 1.2868057
## speed_humps1                           2.0931148 2.3429646
## left_turn_traffic_calming1             2.7934596 3.9013077
## street_improvement_projects_corridors1 4.0359406 4.3859968
## neighborhood_slow_zones1               0.5981811 0.7477761
#build table to plot Odds Ratios
a <- data.frame(exp(coef((mvo_injury))))

a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)

b <- data.frame(exp(confint((mvo_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)

coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"

g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
  ggtitle("Motor Vehicle Injuries - Coefficients & Confidence Intervals")

plot(g)

Build Predictive Model

#GENERAL INJURY 
#split data 70-30

set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression take 1 (all)
m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data = d.train, family = "binomial")

 #bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones +
#train logistic regression take 2 (only real safety features)
#m.log <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, data = d.train, family = "binomial")

#train logistic regression take 3 (only problem zones)
#m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")

#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- 1 - (tn/(tn+fp))

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.7053684

Decision Tree

#Build Decision Tree
mtree.1 <- rpart(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data=d.train, method="class")

rpart.plot(mtree.1)

#Plot variables of importance
d.var_imp <-data.frame(mtree.1$variable.importance)
names(d.var_imp) <- "importance"
d.var_imp$variable <-as.factor(rownames(d.var_imp))
d.var_imp <-transform(d.var_imp, variable=reorder(variable, -importance) )

filt5 <- d.var_imp %>% top_n(-5)
## Selecting by variable
g <-ggplot(filt5,aes(x=variable, y=importance)) + 
  geom_bar(stat="identity") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(g) 

Random Forest

#Generate RF
mrf.1 <- randomForest(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data=d.train, importance = TRUE, ntree=1000)

#plot variables of importance
varImpPlot(mrf.1)

#plot OOB error
mrf.1$importance
##                                                   0            1
## enhanced_crossings                     3.297610e-05 0.0000206177
## speed_humps                           -6.421188e-05 0.0040348501
## left_turn_traffic_calming              2.673379e-03 0.0029948626
## arterial_slow_zones                   -1.997459e-04 0.0044761049
## leading_pedestrian_interval_signals    2.398093e-02 0.0465890394
## signal_timing                          2.736861e-02 0.0644103343
## safe_streets_for_seniors              -6.891478e-04 0.0065085950
## street_improvement_projects_corridors  4.464614e-03 0.0314366349
##                                       MeanDecreaseAccuracy
## enhanced_crossings                            3.055889e-05
## speed_humps                                   7.360478e-04
## left_turn_traffic_calming                     2.736216e-03
## arterial_slow_zones                           7.138060e-04
## leading_pedestrian_interval_signals           2.839536e-02
## signal_timing                                 3.460155e-02
## safe_streets_for_seniors                      7.150871e-04
## street_improvement_projects_corridors         9.727956e-03
##                                       MeanDecreaseGini
## enhanced_crossings                            4.404694
## speed_humps                                 257.116457
## left_turn_traffic_calming                   191.690280
## arterial_slow_zones                         142.394249
## leading_pedestrian_interval_signals        1900.152241
## signal_timing                              1593.838852
## safe_streets_for_seniors                    247.008181
## street_improvement_projects_corridors      1616.682815
x <- gg_error(mrf.1)
plot(x)

#predict diagnosis (tree)
d.test$Predicted_Injury <- predict(mtree.1, d.test, type="class")

#form prediction object
tree_pred <- prediction(predictions=as.numeric(d.test$Predicted_Injury), labels =as.numeric(d.test$Injuries))

#calculate AUC
auc.perf <- performance(tree_pred, measure = "auc")
tree_auc <- auc.perf@y.values

#predict diagnosis (forest)
d.test$Predicted_Injury <- predict(mrf.1, d.test, type="class")

#form prediction object (forest)
rf_pred <-prediction(predictions=as.numeric(d.test$Predicted_Injury), labels =as.numeric(d.test$Injuries))

#calculate auc
auc.perf <- performance(rf_pred, measure = "auc")
rf_auc <- auc.perf@y.values

tree_auc
## [[1]]
## [1] 0.6380118
rf_auc
## [[1]]
## [1] 0.6561914

Pedestrian Injury Predictions

#PEDESTRIAN INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.05,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1222029
# FNR
fn/(fn+tp)
## [1] 0.5208333
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6784819

Bike Injury

#BIKE INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(BikeInjuri ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.03,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1069607
# FNR
fn/(fn+tp)
## [1] 0.5267002
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6831696

MVO Injury

#MVO INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1192252
# FNR
fn/(fn+tp)
## [1] 0.5763889
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6521929